import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import pandas_profiling
pd.set_option('display.max_columns',None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Print every entry of the current working directory with its size in MB
# (size in bytes divided by 1,000,000 and rounded to two decimals).
for f in os.listdir():
    size_mb = round(os.path.getsize(f) / 1000000, 2)
    print(f"{f:<30}--{size_mb}MB")
#No additional files are present apart from the data set
# Load the data set from 'vehicle.csv' in the current working directory.
# NOTE: the bare expressions below (shape, head, tail, ...) only display
# their result when run interactively (e.g. as the last line of a notebook cell).
df=pd.read_csv('vehicle.csv')
df.shape
#contains 864 rows and 19 columns (as recorded by the author; TODO confirm against df.shape)
df.head()
df.tail()
#Initial look at the table: columns, dtypes and non-null counts
df.info()
#understanding the data: summary statistics for all columns, transposed so each row describes one column
df.describe(include='all').T
#identifying null values: number of missing entries per column
df.isnull().sum()
# Treat single-space placeholder strings as missing values (NaN).
df = df.replace(' ', np.nan)

# Replace missing values with the column median.
# The median is more robust to the shape of the distribution than the mean,
# hence it is used for the replacement.
# Every column except the last one (the target) is imputed. Slicing with
# [:-1] instead of a hard-coded 17 keeps the final feature column from being
# skipped and matches the column selection used for the outlier treatment.
for i in df.columns[:-1]:
    median_value = df[i].median()
    df[i] = df[i].fillna(median_value)

# Re-count missing values per column after imputation.
df.isnull().sum()

# Re-validate dtypes and non-null counts after the changes.
df.info()
#understanding outliers: one box plot of the data frame's columns on a single large figure
df.boxplot(figsize=(35,20))
#understanding the distribution before the outlier treatment: one histogram per column
#(the trailing semicolon suppresses the textual return value in a notebook)
df.hist(figsize=(35,20));
# Cap outliers in every column except the last one, which is our target.
# Values below Q1 - 1.5*IQR are raised to that lower fence and values above
# Q3 + 1.5*IQR are lowered to the upper fence.
for cols in df.columns[:-1]:
    q1 = df[cols].quantile(0.25)
    q3 = df[cols].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    # clip() yields the same capped values as two masked .loc assignments,
    # but returns a new Series, so a float fence is never written in place
    # into an integer column (an assignment newer pandas versions deprecate).
    df[cols] = df[cols].clip(lower=low, upper=high)

# Re-evaluate the box plot to check the effect of the capping.
df.boxplot(figsize=(35,20))
# Understanding the dependent column: bar chart of observations per class.
# The Series is passed explicitly as `x`; recent seaborn releases treat the
# first positional argument as `data`, so a positional Series is no longer
# reliably read as the variable to count.
sns.countplot(x=df['class']);
plt.show()
# Exact counts per class, to go with the plot.
print(df['class'].value_counts())
# The target has to be numeric to take part in the correlation matrix.
# Label encoding (rather than one-hot encoding) keeps it as a single column,
# so no extra class columns are introduced at this point.
from sklearn.preprocessing import LabelEncoder

lblEncoder = LabelEncoder()
df['class'] = lblEncoder.fit_transform(df['class'])

# Pairwise correlation of all columns, drawn as an annotated heatmap on a
# dedicated 15x15 figure.
cor_df = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(cor_df, annot=True, ax=ax);
# Pairwise relationships between all columns of the data frame.
sns.pairplot(data=df)
# The same grid, with points coloured by the encoded class on each graph.
sns.pairplot(data=df, hue='class')